In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import matplotlib.dates as mdates
%matplotlib inline
In [2]:
# Load the two inputs: the time-series table (`df`) and the per-post table
# (`pub_dates`) whose columns are renamed further down.
# NOTE(review): both paths are hard-coded under the user's home directory.
df = pd.read_csv('~/development/vinepair_time_series.csv')
pub_dates = pd.read_csv(
    '~/development/slugs_dates_vinepair_posts.csv',
    parse_dates=True,
)
In [3]:
# Peek at the first row of the raw time-series frame.
df.head(1)
Out[3]:
In [4]:
# Peek at the first row of the publication-date frame.
pub_dates.head(1)
Out[4]:
In [5]:
# Check which dtypes read_csv assigned to the publication-date columns.
pub_dates.dtypes
Out[5]:
In [6]:
# Give the columns the names the rest of the notebook relies on.
# The assignment is positional — assumes the CSV has exactly two columns,
# URL first and publication date second (TODO confirm).
# The bare `pub_dates.Date.dtype` expression that preceded this line was removed:
# it was not the cell's last expression, so it displayed nothing, and it made the
# cell raise AttributeError when re-run after the rename.
pub_dates.columns = ["Url","Published"]
In [7]:
# Parse the publication column into datetimes so date parts can be taken later.
pub_dates['Published'] = pd.to_datetime(pub_dates['Published'])
In [8]:
# Confirm the conversion: show the dtype of the Published column.
pub_dates['Published'].dtype
Out[8]:
In [9]:
# First row of the publication-date frame after renaming and date parsing.
pub_dates.head(1)
Out[9]:
In [10]:
# Remove the 'Unnamed: 0' column, then transpose so the former columns become
# the rows and the former row labels become the columns.
# NOTE(review): this rebinds `df`, so the cell cannot be re-run without
# reloading the CSV first.
df = df.drop('Unnamed: 0', axis=1).T
df.head(2)
Out[10]:
In [11]:
# Discard the columns at positions 367..1167, which keeps the first 367 columns
# (positions 0..366) and anything at position 1168 or beyond.
# NOTE(review): presumably this trims each post to about its first year of
# daily data — confirm the intended window.
cols_to_discard = df.columns[367:1168]
df = df.drop(cols_to_discard, axis='columns')
In [15]:
# For every row, find the first column position at which the running total
# strictly exceeds 90% of that row's overall total.
# Computed with a vectorised cumsum instead of a Python loop that rebuilt an
# expanding sum per row. Rows whose running total never exceeds the threshold
# (e.g. all-zero rows) get 0, exactly as argmax did in the loop version.
# Columns this notebook derives later are excluded so that re-running the cell
# does not count them as daily values.
daily_views = df.drop(['days_to_90p', 'total'], axis=1, errors='ignore')
threshold = daily_views.sum(axis=1) * .90
reached = daily_views.cumsum(axis=1).gt(threshold, axis=0)
# argmax over a boolean row returns the position of the first True.
df['days_to_90p'] = reached.values.argmax(axis=1)
In [16]:
# Row total of the daily columns only.
# `days_to_90p` has already been added to `df` by the previous cell, so a plain
# df.sum(axis=1) would add that day count into every total. Derived columns are
# therefore dropped before summing (errors='ignore' keeps the cell re-runnable
# whether or not they exist yet).
df['total'] = df.drop(['days_to_90p', 'total'], axis=1, errors='ignore').sum(axis=1)
In [17]:
# Attach the publication date to each row: index pub_dates by Url, join it onto
# df's index, then move df's index back into an ordinary column.
# NOTE(review): rebinding `pub_dates` means this cell fails if run twice.
pub_dates = pub_dates.set_index('Url')
df = df.join(pub_dates).reset_index()
In [18]:
# Inspect the first row in full after the join.
df.iloc[0]
Out[18]:
In [19]:
# Summary statistics of the per-row totals before filtering.
df['total'].describe()
Out[19]:
In [22]:
# Keep only rows whose total exceeds 200.
# .copy() makes the result an independent frame: later cells add columns to
# `df`, and assigning into a filtered slice triggers SettingWithCopyWarning.
df = df[df.total > 200].copy()
In [23]:
# Summary statistics of the totals after the > 200 filter.
df['total'].describe()
Out[23]:
In [24]:
# Summary statistics of days_to_90p for the filtered rows.
df['days_to_90p'].describe()
Out[24]:
In [25]:
# Kernel-density plot of days_to_90p.
# The former `x=df.index` argument was dropped: `x` must be a column label or
# position, a density plot has no use for it, and current pandas rejects an
# Index object there.
ax = df.plot(y='days_to_90p', kind='density')
ax.set(xlabel='Days Since Publication',title = 'Density Distribution of Days until 90% of Traffic Achieved')
Out[25]:
In [26]:
# Split the publication date into month and year columns, plus a combined
# "<month>-<year>" string label built from those two columns.
published_idx = pd.DatetimeIndex(df['Published'])
df['month_published'] = published_idx.month
df['year_published'] = published_idx.year
df['month_year_published'] = (
    df['month_published'].astype(str) + "-" + df['year_published'].astype(str)
)
In [27]:
# Box plot of days_to_90p per publication month.
# Rows are ordered by year then month so the categories appear chronologically.
# Sorting on the "M-YYYY" string itself is lexicographic ("1-2015", "10-2014",
# "11-2014", ...) and scrambles the x axis.
sns.boxplot(y='days_to_90p',x='month_year_published',
            data=df.sort_values(['year_published','month_published'],ascending=True))
Out[27]:
In [28]:
# Spot-check the first value of the derived month column.
df.month_published.head(1)
Out[28]:
In [29]:
# Number of rows remaining after filtering.
df.shape[0]
Out[29]:
In [30]:
# Number of columns currently in the frame.
df.shape[1]
Out[30]:
In [31]:
# Kernel-density plot of the per-row totals.
# `x=df.index` was removed: `x` expects a column label, is meaningless for a
# density plot, and is rejected by current pandas when given an Index object.
df.plot(y='total',kind='density')
Out[31]:
In [32]:
# Show the rows whose days_to_90p is above 150.
df.loc[df['days_to_90p'] > 150]
Out[32]:
In [33]:
# I want a distribution of days-post-publication and page views (PVs) for the two modes.
In [34]:
# Split the rows into two groups at days_to_90p == 50.
# `slow` uses >= so that rows with exactly 50 are not silently dropped from both
# groups; this matches the 'fast' if x < 50 else 'slow' labelling used for the
# fast_or_slow column later in the notebook.
fast = df[df.days_to_90p < 50]
slow = df[df.days_to_90p >= 50]
In [35]:
# Summary statistics of days_to_90p and total for the fast group.
fast.loc[:, ['days_to_90p', 'total']].describe()
Out[35]:
In [36]:
# Summary statistics of days_to_90p and total for the slow group.
slow.loc[:, ['days_to_90p', 'total']].describe()
Out[36]:
In [37]:
# Transpose `fast`, then take the mean down each column of the transposed frame,
# i.e. one value per row of `fast`.
# NOTE(review): `fast` still carries non-numeric columns (e.g. Published), so the
# transposed data is mixed-type — confirm this is the calculation intended.
fast.T.mean(axis=0)
Out[37]:
In [38]:
# Average every remaining column across the fast group and plot the result.
# numeric_only=True restricts the mean to numeric columns: the string column
# created by reset_index() is not in the drop list, and on pandas >= 2.0 a
# non-numeric column makes DataFrame.mean raise TypeError.
derived_cols = ['total','days_to_90p','Published','month_published','year_published',
                "month_year_published"]
ax = fast.drop(derived_cols,axis=1).mean(axis=0, numeric_only=True).plot()
ax.set(xlabel='Days Since Publication',ylabel ='Average Daily Page Views',title = 'Average Daily Traffic for Content with short halflife')
Out[38]:
In [39]:
# Average every remaining column across the slow group and plot the result.
# numeric_only=True restricts the mean to numeric columns: the string column
# created by reset_index() is not in the drop list, and on pandas >= 2.0 a
# non-numeric column makes DataFrame.mean raise TypeError.
derived_cols = ['total','days_to_90p','Published','month_published','year_published',
                "month_year_published"]
ax = slow.drop(derived_cols,axis=1).mean(axis=0, numeric_only=True).plot()
ax.set(xlabel='Days Since Publication',ylabel ='Average Daily Page Views',title = 'Average Daily Traffic for Content with long halflife')
Out[39]:
In [40]:
# How many slow-group rows were published in each year.
slow['year_published'].value_counts()
Out[40]:
In [41]:
# How many fast-group rows were published in each year.
fast['year_published'].value_counts()
Out[41]:
In [42]:
# Same mean curve as for the whole slow group, restricted to rows published in 2015.
# numeric_only=True restricts the mean to numeric columns: the string column
# created by reset_index() is not in the drop list, and on pandas >= 2.0 a
# non-numeric column makes DataFrame.mean raise TypeError.
derived_cols = ['total','days_to_90p','Published','month_published','year_published',
                "month_year_published"]
slow_2015 = slow[slow.year_published == 2015]
ax = slow_2015.drop(derived_cols,axis=1).mean(axis=0, numeric_only=True).plot()
ax.set(xlabel='Days Since Publication',ylabel ='Average Daily Page Views',title = 'Average Daily Traffic for Content with long halflife 2015 ')
Out[42]:
In [43]:
# Same mean curve as for the whole fast group, restricted to rows published in 2015.
# numeric_only=True restricts the mean to numeric columns: the string column
# created by reset_index() is not in the drop list, and on pandas >= 2.0 a
# non-numeric column makes DataFrame.mean raise TypeError.
derived_cols = ['total','days_to_90p','Published','month_published','year_published',
                "month_year_published"]
fast_2015 = fast[fast.year_published == 2015]
ax = fast_2015.drop(derived_cols,axis=1).mean(axis=0, numeric_only=True).plot()
ax.set(xlabel='Days Since Publication',ylabel ='Average Daily Page Views',title = 'Average Daily Traffic for Content with short halflife 2015 ')
Out[43]:
In [44]:
# Label each row 'fast' when days_to_90p is below 50, otherwise 'slow'.
df['fast_or_slow'] = np.where(df['days_to_90p'] < 50, 'fast', 'slow')
In [ ]:
In [45]:
# Point plot of days_to_90p over publication month, one series per fast/slow label.
plt.figure(figsize=(20,10))
# month_year_published is built as "<month>-<year>", so parse it with an explicit
# format instead of relying on per-value date inference.
published_month = pd.to_datetime(df.month_year_published, format='%m-%Y')
ax = sns.pointplot(x=published_month,y='days_to_90p',hue='fast_or_slow',data=df)
# Tilt the tick labels so the dates do not overlap.
ax.set_xticklabels(ax.get_xticklabels(),rotation=30)
plt.show()
In [63]:
# Work on the first 92 columns only. After reset_index() the first of these is
# the former index column, so the rest are the earliest daily columns.
# .copy() detaches the slice from `df`: later cells add columns to df_ninety,
# and assigning into a slice triggers SettingWithCopyWarning.
df_ninety = df.iloc[:,:92].copy()
In [64]:
# Peek at the first row of the truncated frame.
df_ninety.head(1)
Out[64]:
In [69]:
# Recompute days_to_90p within the truncated window: the first column position
# at which the running total strictly exceeds 90% of the row's windowed total.
# df_ninety's first column is the string column produced by reset_index(), so
# only numeric columns are used here. Feeding whole rows to expanding().sum()
# mixes strings with numbers and shifts every position by one.
numeric_cols = df_ninety.select_dtypes(include=[np.number])
# Exclude derived columns so re-running the cell does not count them as views.
daily_views = numeric_cols.drop(['days_to_90p', 'total'], axis=1, errors='ignore')
threshold = daily_views.sum(axis=1) * .90
reached = daily_views.cumsum(axis=1).gt(threshold, axis=0)
# argmax over a boolean row returns the position of the first True (0 if none).
df_ninety['days_to_90p'] = reached.values.argmax(axis=1)
In [43]:
# Row total over the first 92 columns of the truncated frame.
# numeric_only=True skips the string column that reset_index() placed first;
# without it the row-wise sum mixes strings and numbers and fails on current pandas.
df_ninety['total'] = df_ninety.iloc[:,:92].sum(axis=1, numeric_only=True)
Out[43]:
In [68]:
# List the dtype of every column in the truncated frame.
df_ninety.dtypes
Out[68]:
In [ ]: